# COVID-19 Data Visualization

library( "ggpmisc")
## Warning: pakiet 'ggpmisc' został zbudowany w wersji R 4.3.2
## Ładowanie wymaganego pakietu: ggpp
## Warning: pakiet 'ggpp' został zbudowany w wersji R 4.3.2
## Ładowanie wymaganego pakietu: ggplot2
## Warning: pakiet 'ggplot2' został zbudowany w wersji R 4.3.2
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
## 
## Dołączanie pakietu: 'ggpp'
## Następujący obiekt został zakryty z 'package:ggplot2':
## 
##     annotate
## Registered S3 method overwritten by 'ggpmisc':
##   method                  from   
##   as.character.polynomial polynom
library("ggplot2")
library("plotly")
## Warning: pakiet 'plotly' został zbudowany w wersji R 4.3.2
## 
## Dołączanie pakietu: 'plotly'
## Następujący obiekt został zakryty z 'package:ggplot2':
## 
##     last_plot
## Następujący obiekt został zakryty z 'package:stats':
## 
##     filter
## Następujący obiekt został zakryty z 'package:graphics':
## 
##     layout
library(readr)
## Warning: pakiet 'readr' został zbudowany w wersji R 4.3.2
# Load total_cases.csv, a dataset with only two columns: date and cases.
# Lines starting with `##` are the console output captured when the script
# was rendered.
# NOTE(review): the path below is absolute and machine-specific.

total_cases <- read_csv("C:/Users/Martynaa/Desktop/portfolio/analizy_R/total_cases.csv")
## Rows: 64 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): cases
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows.
head(total_cases)
## # A tibble: 6 × 2
##   date        cases
##   <date>      <dbl>
## 1 2020-11-21 656305
## 2 2020-11-22 607893
## 3 2020-11-23 547681
## 4 2020-11-24 463730
## 5 2020-11-25 603506
## 6 2020-11-26 589316
# Structure: column names, column types and a preview of the values.
str(total_cases)
## spc_tbl_ [64 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date : Date[1:64], format: "2020-11-21" "2020-11-22" ...
##  $ cases: num [1:64] 656305 607893 547681 463730 603506 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_date(format = ""),
##   ..   cases = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Summary: range and quartiles of both columns.
summary(total_cases)
##       date                cases       
##  Min.   :2020-11-21   Min.   :312202  
##  1st Qu.:2020-12-06   1st Qu.:570826  
##  Median :2020-12-22   Median :628794  
##  Mean   :2020-12-22   Mean   :629472  
##  3rd Qu.:2021-01-07   3rd Qu.:694328  
##  Max.   :2021-01-23   Max.   :898893
# Dimensions: number of rows, then number of columns.
dim(total_cases)
## [1] 64  2
# Interactive line chart of the case counts over time.
# The static ggplot object is built first and then handed to plotly.
first_chart <- ggplot(data = total_cases, mapping = aes(x = date, y = cases)) +
  geom_line(colour = "blue") +
  ggtitle("Number of COVID-19 cases") +
  xlab("Date") +
  ylab("Sum: confirmed cases")

# Convert the ggplot object into an interactive plotly widget.
ggplotly(p = first_chart)
# Histogram and density plot of the case counts.
# The histogram is drawn on the density scale (after_stat(density)) so that it
# shares the y-axis with the kernel density estimate layered on top of it.
ggplot(total_cases, aes(x = cases)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, # stat_bin()'s default, stated explicitly
    color = "black",
    fill = "white"
  ) +
  geom_density(color = "lightpink", fill = "lightpink", alpha = 0.4) +
  theme(
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = "white"),
    axis.line = element_line(colour = "black"),
    panel.grid.major = element_line(colour = "grey50")
  ) +
  labs(title = "Histogram and density plot")

# Load the second dataset: 7 countries with their total (cumulative) cases
# for each day.
# NOTE(review): the path below is absolute and machine-specific.
top7<-read.csv("C:/Users/Martynaa/Desktop/portfolio/analizy_R/top7_02_2020.csv")
# Preview the first six rows.
head(top7)
##        country       date cum_cases
## 1      Germany 2020-02-18        16
## 2         Iran 2020-02-18         0
## 3        Italy 2020-02-18         3
## 4 Korea, South 2020-02-18        31
## 5        Spain 2020-02-18         2
## 6           US 2020-02-18        13
# Structure: read.csv() leaves `date` as a character column here, so plots
# that need a time axis have to convert it with as.Date().
str(top7)
## 'data.frame':    2030 obs. of  3 variables:
##  $ country  : chr  "Germany" "Iran" "Italy" "Korea, South" ...
##  $ date     : chr  "2020-02-18" "2020-02-18" "2020-02-18" "2020-02-18" ...
##  $ cum_cases: int  16 0 3 31 2 13 13 13 13 13 ...
# Summary statistics for every column of top7.
# summary() is used here because summarise() without any summary expressions
# only yields a data frame with one row and zero columns.
summary(top7)
# Color-coded chart by country: cumulative cases per day on a log10 y-axis.
# `date` is stored as character in top7, so it is converted to Date to get a
# continuous time axis instead of one discrete tick per distinct date string.
# Rows with zero cases are left out because log10(0) is -Inf and cannot be
# placed on a logarithmic axis.
ggplot(
  data = top7[which(top7$cum_cases > 0), ],
  aes(x = as.Date(date), y = cum_cases, col = country)
) +
  geom_point() +
  scale_y_log10() +
  labs(title = "Color-coded chart by country", x = "Date", y = "Cum_cases")

# Infection trend over time: one line per country on a linear y-axis.
# `date` is converted to Date so that the x-axis is a continuous time scale.
ggplot(
  top7,
  mapping = aes(x = as.Date(date), y = cum_cases, colour = country)
) +
  geom_line() +
  ggtitle("Infection trend over time") +
  xlab("Date") +
  ylab("Cum_sum")

# Comparison of the number of cases on a logarithmic scale, which captures
# growth proportions better than the linear scale.
# Rows with zero cases are left out because log10(0) is -Inf and cannot be
# placed on a logarithmic axis.
ggplot(
  data = top7[which(top7$cum_cases > 0), ],
  aes(x = as.Date(date), y = cum_cases, color = country)
) +
  geom_line() +
  scale_y_log10() +
  labs(
    title = "Comparison of the number of cases on a logarithmic scale",
    x = "Date",
    y = "Cum_sum (log)"
  )

# Number of cases on selected days (start, middle, end), one panel per day.
# `date` is a character column in top7, so the days are matched as strings.
selected_dates <- filter(
  top7,
  date %in% c("2020-02-18", "2020-03-01", "2020-03-15")
)
ggplot(selected_dates, aes(x = country, y = cum_cases, fill = country)) +
  geom_col(position = "dodge") +
  facet_wrap(facets = ~date) +
  ggtitle("Number of cases on selected days") +
  xlab("Country") +
  ylab("Number of Cases") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Percentage of countries in the global number of cases.
# A country's total is its largest cumulative count; its percentage is that
# total relative to the sum of the totals of all countries in top7.
total_cases_by_country <- top7 %>%
  group_by(country) %>%
  summarise(total_cases = max(cum_cases, na.rm = TRUE)) %>%
  mutate(percentage = total_cases / sum(total_cases) * 100)

# Bars are ordered from the largest to the smallest percentage.
ggplot(
  total_cases_by_country,
  aes(x = reorder(country, -percentage), y = percentage, fill = country)
) +
  geom_col() +
  ggtitle("Percentage of countries in the global number of cases") +
  xlab("Country") +
  ylab("Percentage")

# Heatmap: one tile per country and day, shaded from white to red by the
# cumulative number of cases.

ggplot(top7, mapping = aes(x = as.Date(date), y = country, fill = cum_cases)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  ggtitle("Heatmap of cases over time") +
  xlab("Date") +
  ylab("Country")

library(dplyr)
## Warning: pakiet 'dplyr' został zbudowany w wersji R 4.3.2
## 
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the third dataset: countries, their provinces, dates and case counts
# (daily `cases` and cumulative `cum_cases`).
# NOTE(review): the path below is absolute and machine-specific.
data_c <- read_csv("C:/Users/Martynaa/Desktop/portfolio/analizy_R/cases_by_country.csv")
## Rows: 13272 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): country, province
## dbl  (2): cases, cum_cases
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows; `province` is NA in all of the rows shown.
head(data_c)
## # A tibble: 6 × 5
##   country             province date       cases cum_cases
##   <chr>               <chr>    <date>     <dbl>     <dbl>
## 1 Afghanistan         <NA>     2020-01-22     0         0
## 2 Albania             <NA>     2020-01-22     0         0
## 3 Algeria             <NA>     2020-01-22     0         0
## 4 Andorra             <NA>     2020-01-22     0         0
## 5 Antigua and Barbuda <NA>     2020-01-22     0         0
## 6 Argentina           <NA>     2020-01-22     0         0
# Structure: column names, column types and a preview of the values.
str(data_c)
## spc_tbl_ [13,272 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ country  : chr [1:13272] "Afghanistan" "Albania" "Algeria" "Andorra" ...
##  $ province : chr [1:13272] NA NA NA NA ...
##  $ date     : Date[1:13272], format: "2020-01-22" "2020-01-22" ...
##  $ cases    : num [1:13272] 0 0 0 0 0 0 0 0 0 0 ...
##  $ cum_cases: num [1:13272] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   country = col_character(),
##   ..   province = col_character(),
##   ..   date = col_date(format = ""),
##   ..   cases = col_double(),
##   ..   cum_cases = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Summary of every column.
# NOTE(review): the minimum of `cases` is negative (-20); presumably a data
# correction in the source - confirm before aggregating daily counts.
summary(data_c)
##    country            province              date                cases         
##  Length:13272       Length:13272       Min.   :2020-01-22   Min.   : -20.000  
##  Class :character   Class :character   1st Qu.:2020-02-04   1st Qu.:   0.000  
##  Mode  :character   Mode  :character   Median :2020-02-18   Median :   0.000  
##                                        Mean   :2020-02-18   Mean   :   8.747  
##                                        3rd Qu.:2020-03-03   3rd Qu.:   0.000  
##                                        Max.   :2020-03-17   Max.   :5198.000  
##    cum_cases      
##  Min.   :    0.0  
##  1st Qu.:    0.0  
##  Median :    2.0  
##  Mean   :  182.5  
##  3rd Qu.:   15.0  
##  Max.   :31506.0
# Top 9 countries by their largest cumulative case count.
# The per-country row count is stored in its own column (`n_rows`) so that the
# `country` grouping key keeps the country names.
top <- data_c %>%
  group_by(country) %>%
  summarise(n_rows = n(), max = max(cum_cases, na.rm = TRUE))
top
# Keep the nine countries with the highest maximum, largest first.
top9 <- top %>%
  arrange(desc(max)) %>%
  slice_head(n = 9) %>%
  select(country, max)
top9
# As we can see, the top 7 countries with the highest number of COVID-19 cases are the same countries as in the previous dataset.

# Interactive chart for USA provinces: keep only the US rows dated after
# 2020-02-20, then print the result.
dane <- filter(data_c, country == "US", date > as.Date("2020-02-20"))
dane
## # A tibble: 1,482 × 5
##    country province             date       cases cum_cases
##    <chr>   <chr>                <date>     <dbl>     <dbl>
##  1 US      Alabama              2020-02-21     0        13
##  2 US      Alaska               2020-02-21     0        13
##  3 US      Arizona              2020-02-21     0        13
##  4 US      Arkansas             2020-02-21     0        13
##  5 US      California           2020-02-21     2        15
##  6 US      Colorado             2020-02-21     0        15
##  7 US      Connecticut          2020-02-21     0        15
##  8 US      Delaware             2020-02-21     0        15
##  9 US      Diamond Princess     2020-02-21     0        15
## 10 US      District of Columbia 2020-02-21     0        15
## # ℹ 1,472 more rows
# Interactive scatter plot of the daily cases in `dane`: one marker per
# province and date, coloured by province with a palette built from the
# 8-colour "Set2" ColorBrewer scheme.
plot_ly(
  data = dane,
  x = ~date,
  y = ~cases,
  color = ~province,
  colors = RColorBrewer::brewer.pal(n = 8, name = "Set2"),
  type = "scatter",
  mode = "markers"
)